# Build the sorted list of distinct GOV2 compressed segment files (.gz) that
# hold the documents named in the input file, and write that list, one path
# per line after a header line, to the output file.
#
# The script keeps the file's original Python 2 dialect; each print uses a
# single pre-formatted argument so the printed text is the same whether it is
# parsed as a print statement or as a print() call.
inputFileName = "/data3/obukai/workspace/web-search-engine-wei/polyIRIndexer/small_set_of_gov2_documents_needed_to_be_extracted_for_training_queries_sorted_by_trecID"

basePath = "/data/jhe/trecdata/"

# step1: collect the segment file of every document that needs to be extracted.
# A set does the de-duplication so each membership test is O(1) instead of a
# linear scan over a growing list.
gov2SegmentSet = set()

# The context manager closes the input file even if a malformed line raises.
with open(inputFileName, "r") as inputFileHandler:
    # Uncomment the next line to skip a header line in the input file.
    # inputFileHandler.readline()

    for line in inputFileHandler:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at EOF) carry no trecID.
            continue

        # The trecID is the fifth space-separated field of the line.
        trecID = line.split(" ")[4]

        # Only the first two dash-separated parts of the trecID are used:
        # the folder name and the compressed segment name inside that folder.
        trecIDElements = trecID.strip().split("-")
        gov2FolderName = trecIDElements[0]
        gov2CompressFileSegmentName = trecIDElements[1]

        compressedFilePath = basePath + gov2FolderName + "/" + gov2CompressFileSegmentName + ".gz"
        print(compressedFilePath)
        gov2SegmentSet.add(compressedFilePath)

# Ascending order, same as the default list.sort() the script used before.
gov2SegmentList = sorted(gov2SegmentSet)
print("len(gov2SegmentList): %d" % len(gov2SegmentList))

outputFileName = "/data3/obukai/workspace/web-search-engine-wei/gov2_files_for_typical_training_examples"

# step2: write the header line followed by one segment path per line.
with open(outputFileName, "w") as outputFileHandler:
    # NOTE(review): the header value "19" is hard-coded; it may be meant to be
    # the number of paths that follow (len(gov2SegmentList)) -- confirm with
    # whatever consumes this file before changing it.
    outputFileHandler.write("19" + "\n")

    for path in gov2SegmentList:
        outputFileHandler.write(path + "\n")